import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, log_loss
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Reading Normal.csv
# Load the 'Normal' class records (6 biomechanical float features + a Class
# label column) and preview the first 5 rows.
type_normal=pd.read_csv('Normal.csv')
type_normal.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 | Normal |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 | Normal |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 | Normal |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 | Normal |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 | Normal |
# Reading Type_H.csv
# Load the 'Type_H' class records (same 7-column schema as Normal.csv)
# and preview the first 5 rows.
type_h=pd.read_csv('Type_H.csv')
type_h.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 63.027817 | 22.552586 | 39.609117 | 40.475232 | 98.672917 | -0.254400 | Type_H |
| 1 | 39.056951 | 10.060991 | 25.015378 | 28.995960 | 114.405425 | 4.564259 | Type_H |
| 2 | 68.832021 | 22.218482 | 50.092194 | 46.613539 | 105.985135 | -3.530317 | Type_H |
| 3 | 69.297008 | 24.652878 | 44.311238 | 44.644130 | 101.868495 | 11.211523 | Type_H |
| 4 | 49.712859 | 9.652075 | 28.317406 | 40.060784 | 108.168725 | 7.918501 | Type_H |
# Reading Type_S.csv
# Load the 'Type_S' class records (same 7-column schema as the other two files)
# and preview the first 5 rows.
type_s=pd.read_csv('Type_S.csv')
type_s.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 74.377678 | 32.053104 | 78.772013 | 42.324573 | 143.560690 | 56.125906 | Type_S |
| 1 | 89.680567 | 32.704435 | 83.130732 | 56.976132 | 129.955476 | 92.027277 | Type_S |
| 2 | 44.529051 | 9.433234 | 52.000000 | 35.095817 | 134.711772 | 29.106575 | Type_S |
| 3 | 77.690577 | 21.380645 | 64.429442 | 56.309932 | 114.818751 | 26.931841 | Type_S |
| 4 | 76.147212 | 21.936186 | 82.961502 | 54.211027 | 123.932010 | 10.431972 | Type_S |
# Sanity-check the Normal frame: output shows 100 rows x 7 columns, no nulls.
print("Shape of normal class", type_normal.shape)
type_normal.info()
Shape of normal class (100, 7) <class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 100 non-null float64 1 P_tilt 100 non-null float64 2 L_angle 100 non-null float64 3 S_slope 100 non-null float64 4 P_radius 100 non-null float64 5 S_Degree 100 non-null float64 6 Class 100 non-null object dtypes: float64(6), object(1) memory usage: 5.6+ KB
Normal class dataset contains 100 data with 7 columns.
# Sanity-check the Type_H frame: output shows 60 rows x 7 columns, no nulls.
print("Shape of type_h class", type_h.shape)
type_h.info()
Shape of type_h class (60, 7) <class 'pandas.core.frame.DataFrame'> RangeIndex: 60 entries, 0 to 59 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 60 non-null float64 1 P_tilt 60 non-null float64 2 L_angle 60 non-null float64 3 S_slope 60 non-null float64 4 P_radius 60 non-null float64 5 S_Degree 60 non-null float64 6 Class 60 non-null object dtypes: float64(6), object(1) memory usage: 3.4+ KB
Type_H class dataset contains 60 data with 7 columns.
# Sanity-check the Type_S frame: output shows 150 rows x 7 columns, no nulls.
print("Shape of type_s class", type_s.shape)
type_s.info()
Shape of type_s class (150, 7) <class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 150 non-null float64 1 P_tilt 150 non-null float64 2 L_angle 150 non-null float64 3 S_slope 150 non-null float64 4 P_radius 150 non-null float64 5 S_Degree 150 non-null float64 6 Class 150 non-null object dtypes: float64(6), object(1) memory usage: 8.3+ KB
Type_S class dataset contains 150 data with 7 columns.
# Show the column list of each source frame so they can be compared by eye
# before the programmatic check below.
for label, frame in (("Normal", type_normal), ("Type_H", type_h), ("Type_S", type_s)):
    print(f"Columns of {label} class Dataset:-", ", ".join(frame.columns))
Columns of Normal class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class Columns of Type_H class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class Columns of Type_S class Dataset:- P_incidence, P_tilt, L_angle, S_slope, P_radius, S_Degree, Class
def compare_list(l1, l2, l3):
    """Return 'Identical columns' when the three lists contain the same
    elements (order-insensitive), else 'Non-Identical columns'.

    Uses sorted() copies instead of in-place .sort(): the original version
    silently reordered the caller's lists as a side effect.
    """
    if sorted(l1) == sorted(l2) == sorted(l3):
        return 'Identical columns'
    return 'Non-Identical columns'
print("Column comparison for Normal, Type_H and Type_S class Dataset:-", compare_list(type_normal.columns.tolist(), type_h.columns.tolist(), type_s.columns.tolist()))
Column comparison for Normal, Type_H and Type_S class Dataset:- Identical columns
All three datasets have the same column names.
# Per-column dtypes: six float64 features plus the object-typed Class label.
print("DataType of Normal class Dataset")
type_normal.dtypes
DataType of Normal class Dataset
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
In Normal class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.
# Per-column dtypes: six float64 features plus the object-typed Class label.
print("DataType of Type_H class Dataset")
type_h.dtypes
DataType of Type_H class Dataset
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
In Type_H class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.
# Per-column dtypes: six float64 features plus the object-typed Class label.
print("DataType of Type_S class Dataset")
type_s.dtypes
DataType of Type_S class Dataset
P_incidence float64 P_tilt float64 L_angle float64 S_slope float64 P_radius float64 S_Degree float64 Class object dtype: object
In Type_S class Dataset, 6 columns i.e. P_incidence, P_tilt, L_angle, S_slope, P_radius & S_Degree have float64 as dataType and 1 column i.e. Class as object dataType.
print("Values of 'Class' feature in Normal class Dataset distributed as:-", ', '.join(type_normal.Class.unique()))
Values of 'Class' feature in Normal class Dataset distributed as:- Normal, Nrmal
The 'Class' feature of the Normal class dataset contains inconsistently formatted values, which will all be changed to 'normal'.
print("Values of 'Class' feature in Type_H class Dataset distributed as:-", ', '.join(type_h.Class.unique()))
Values of 'Class' feature in Type_H class Dataset distributed as:- Type_H, type_h
'Class' feature of Type_H class dataset don't have formatted values, which will be changed to all as 'type_h'.
print("Values of 'Class' feature in Type_S class Dataset distributed as:-", ', '.join(type_s.Class.unique()))
Values of 'Class' feature in Type_S class Dataset distributed as:- Type_S, tp_s
'Class' feature of Type_S class dataset don't have formatted values, which will be changed to all as 'type_s'.
# Unifying values for 'Class' feature column in all dataframes:
# overwrite the whole column with one canonical label per frame, which also
# fixes the typo variants ('Nrmal', 'type_h', 'tp_s') seen above.
for frame, label in ((type_normal, 'normal'), (type_s, 'type_s'), (type_h, 'type_h')):
    frame['Class'] = label
print("Values of 'Class' feature in Normal class Dataset has been unified:-", ', '.join(type_normal.Class.unique()))
Values of 'Class' feature in Normal class Dataset has been unified:- normal
print("Values of 'Class' feature in Type_H class Dataset has been unified:-", ', '.join(type_h.Class.unique()))
Values of 'Class' feature in Type_H class Dataset has been unified:- type_h
print("Values of 'Class' feature in Type_S class Dataset has been unified:-", ', '.join(type_s.Class.unique()))
Values of 'Class' feature in Type_S class Dataset has been unified:- type_s
Values have been formatted in all dataframes and are in ready to use state.
# Combine the three class-level frames into one 310-row dataset.
# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.x; ignore_index=True yields a clean 0..309 RangeIndex directly,
# subsuming the separate reset_index(drop=True) step.
patients = pd.concat([type_normal, type_h, type_s], ignore_index=True)
patients.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 310 entries, 0 to 309 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null object dtypes: float64(6), object(1) memory usage: 17.1+ KB
After combining all dataframes, we got resultant dataframe with 310 rows and 7 columns. Let's change Class dType from object to category, once encoded the lables.
# Convert Class from object to pandas 'category' dtype (nominal label; the
# info() output below shows the memory drop from 17.1 KB to 15.1 KB).
patients.Class=patients.Class.astype('category')
patients.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 310 entries, 0 to 309 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null category dtypes: category(1), float64(6) memory usage: 15.1 KB
patients.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 | normal |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 | normal |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 | normal |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 | normal |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 | normal |
patients.sample(n=5, random_state=1)
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 78 | 74.565015 | 15.724320 | 58.618582 | 58.840695 | 105.417304 | 0.599247 | normal |
| 244 | 60.044177 | 14.309656 | 58.038865 | 45.734521 | 105.131664 | 30.409133 | type_s |
| 185 | 45.443750 | 9.906072 | 45.000000 | 35.537678 | 163.071041 | 20.315315 | type_s |
| 70 | 50.086153 | 13.430044 | 34.457541 | 36.656108 | 119.134622 | 3.089484 | normal |
| 120 | 43.922840 | 14.177959 | 37.832547 | 29.744881 | 134.461016 | 6.451648 | type_h |
Printed 5 random samples of the patients dataframe with n as 5 and random_state as 1, so the same rows are returned on every run.
# To check the missing data percentage
# mean() of the boolean null-mask gives the fraction of missing values per column.
patients.isnull().mean()*100
P_incidence 0.0 P_tilt 0.0 L_angle 0.0 S_slope 0.0 P_radius 0.0 S_Degree 0.0 Class 0.0 dtype: float64
(patients.isnull().sum()/patients.shape[0])*100
P_incidence 0.0 P_tilt 0.0 L_angle 0.0 S_slope 0.0 P_radius 0.0 S_Degree 0.0 Class 0.0 dtype: float64
No missing data found, that's why 0%.
patients.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| P_incidence | 310.0 | NaN | NaN | NaN | 60.496653 | 17.23652 | 26.147921 | 46.430294 | 58.691038 | 72.877696 | 129.834041 |
| P_tilt | 310.0 | NaN | NaN | NaN | 17.542822 | 10.00833 | -6.554948 | 10.667069 | 16.357689 | 22.120395 | 49.431864 |
| L_angle | 310.0 | NaN | NaN | NaN | 51.93093 | 18.554064 | 14.0 | 37.0 | 49.562398 | 63.0 | 125.742385 |
| S_slope | 310.0 | NaN | NaN | NaN | 42.953831 | 13.423102 | 13.366931 | 33.347122 | 42.404912 | 52.695888 | 121.429566 |
| P_radius | 310.0 | NaN | NaN | NaN | 117.920655 | 13.317377 | 70.082575 | 110.709196 | 118.268178 | 125.467674 | 163.071041 |
| S_Degree | 310.0 | NaN | NaN | NaN | 26.296694 | 37.559027 | -11.058179 | 1.603727 | 11.767934 | 41.287352 | 418.543082 |
| Class | 310 | 3 | type_s | 150 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Static box plots of all numeric features to eyeball spread and outliers.
plt.figure(figsize=(20,10))
sns.boxplot(data=patients)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
# Interactive (plotly) version of the same box plots, Class column excluded.
box=px.box(patients.drop(columns=['Class']), orientation='h')
box.show();
patients.mean(),patients.median()
(P_incidence 60.496653 P_tilt 17.542822 L_angle 51.930930 S_slope 42.953831 P_radius 117.920655 S_Degree 26.296694 dtype: float64, P_incidence 58.691038 P_tilt 16.357689 L_angle 49.562398 S_slope 42.404912 P_radius 118.268178 S_Degree 11.767934 dtype: float64)
Summarising 5-point summary:
patients.corr()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| P_incidence | 1.000000 | 0.629199 | 0.717282 | 0.814960 | -0.247467 | 0.638743 |
| P_tilt | 0.629199 | 1.000000 | 0.432764 | 0.062345 | 0.032668 | 0.397862 |
| L_angle | 0.717282 | 0.432764 | 1.000000 | 0.598387 | -0.080344 | 0.533667 |
| S_slope | 0.814960 | 0.062345 | 0.598387 | 1.000000 | -0.342128 | 0.523557 |
| P_radius | -0.247467 | 0.032668 | -0.080344 | -0.342128 | 1.000000 | -0.026065 |
| S_Degree | 0.638743 | 0.397862 | 0.533667 | 0.523557 | -0.026065 | 1.000000 |
# Correlation heatmap of the numeric features, annotated with the coefficients.
plt.figure(figsize=(15,7))
sns.heatmap(patients.corr(), annot=True, cmap='YlGnBu', lw=1)
plt.yticks(fontsize=12, weight='bold', rotation=45)
plt.xticks(fontsize=12, weight='bold', rotation=45)
plt.title(label='Correlation Heatmap', fontsize=18, weight='bold')
plt.show();
P_incidence have high correlation with S_Degree, S_slope, L_angle, P_tilt.
Some data have negative correlation also.
patients.corr()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| P_incidence | 1.000000 | 0.629199 | 0.717282 | 0.814960 | -0.247467 | 0.638743 |
| P_tilt | 0.629199 | 1.000000 | 0.432764 | 0.062345 | 0.032668 | 0.397862 |
| L_angle | 0.717282 | 0.432764 | 1.000000 | 0.598387 | -0.080344 | 0.533667 |
| S_slope | 0.814960 | 0.062345 | 0.598387 | 1.000000 | -0.342128 | 0.523557 |
| P_radius | -0.247467 | 0.032668 | -0.080344 | -0.342128 | 1.000000 | -0.026065 |
| S_Degree | 0.638743 | 0.397862 | 0.533667 | 0.523557 | -0.026065 | 1.000000 |
Correlation coefficients are used to measure the strength of the relationship between two variables, where values always range between -1 (strong negative relationship) and +1 (strong positive relationship). Values at or close to zero imply a weak or no linear relationship. Correlation coefficients with an absolute value below 0.8 (i.e. between -0.8 and +0.8) are generally not considered strong. So as per this:
# Pairwise scatter/KDE grid coloured by Class.
# sns.pairplot is a figure-level function that creates its own figure, so the
# original plt.figure(figsize=(20,10)) only produced an empty
# "<Figure ... with 0 Axes>" artifact; size the grid via `height` instead.
sns.pairplot(patients, hue='Class', corner=True, height=2.5)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
<Figure size 1440x720 with 0 Axes>
# Joint distribution of the two most correlated features (r = 0.81) with a
# regression fit. sns.jointplot is figure-level and builds its own figure, so
# the original plt.figure(figsize=(20,10)) only left an empty
# "<Figure ... with 0 Axes>" artifact; size via `height` instead.
sns.jointplot(data=patients, x='P_incidence', y='S_slope', kind='reg', height=10)
plt.show();
<Figure size 1440x720 with 0 Axes>
# One interactive horizontal box plot per numeric feature, split by Class,
# to compare per-class distributions feature by feature.
for feature in ('P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree'):
    box = px.box(patients, x=feature, y='Class')
    box.show()
Converting 'Class' to numerical values.
# Encode Class labels as integers; LabelEncoder assigns codes alphabetically,
# so normal -> 0, type_h -> 1, type_s -> 2 (matches the value_counts output).
label_encoder=LabelEncoder()
patients.Class=label_encoder.fit_transform(patients.Class)
patients.Class.value_counts()
2 150 0 100 1 60 Name: Class, dtype: int64
# Store the integer codes as a pandas 'category' dtype so the target is
# treated as nominal rather than numeric.
patients.Class=patients['Class'].astype('category')
patients.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 310 entries, 0 to 309 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null category dtypes: category(1), float64(6) memory usage: 15.1 KB
# Independent Variables
X_outlier1_patient=patients.drop(columns='Class', axis=1)
# Target Variables
y_outlier1_patient=patients.Class
# Standardizing Independent Variables i.e. X
# z-score = (x - mean) / std per column, putting all features on a comparable
# scale for the distance-based KNN model below.
X_outlier1_patient_Scaled=X_outlier1_patient.apply(stats.zscore)
X_outlier1_patient_Scaled.describe()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| count | 3.100000e+02 | 3.100000e+02 | 3.100000e+02 | 3.100000e+02 | 3.100000e+02 | 3.100000e+02 |
| mean | 1.042177e-16 | 2.096889e-16 | 1.980495e-16 | 2.829278e-17 | -5.071212e-16 | -1.277204e-16 |
| std | 1.001617e+00 | 1.001617e+00 | 1.001617e+00 | 1.001617e+00 | 1.001617e+00 | 1.001617e+00 |
| min | -1.996010e+00 | -2.411664e+00 | -2.047652e+00 | -2.207741e+00 | -3.597963e+00 | -9.961725e-01 |
| 25% | -8.173982e-01 | -6.881138e-01 | -8.060267e-01 | -7.168418e-01 | -5.423830e-01 | -6.585073e-01 |
| 50% | -1.049246e-01 | -1.186061e-01 | -1.278621e-01 | -4.095971e-02 | 2.613767e-02 | -3.874502e-01 |
| 75% | 7.194643e-01 | 4.581158e-01 | 5.975493e-01 | 7.269414e-01 | 5.676209e-01 | 3.997679e-01 |
| max | 4.029206e+00 | 3.191402e+00 | 3.984615e+00 | 5.855771e+00 | 3.395818e+00 | 1.046035e+01 |
Independent variables are standardized using z-score, and now can be used for futher analysis.
# 80/20 train/test split; random_state=42 makes the partition reproducible.
X_outlier1_patient_train, X_outlier1_patient_test, y_outlier1_patient_train, y_outlier1_patient_test = train_test_split(X_outlier1_patient_Scaled, y_outlier1_patient, test_size=0.20, random_state=42)
X_outlier1_patient_train.shape, X_outlier1_patient_test.shape
((248, 6), (62, 6))
# Baseline KNN using the sqrt-of-n heuristic: int(sqrt(310)) = 17 neighbours,
# with plain Euclidean distance on the z-scored features.
NNH=KNeighborsClassifier(n_neighbors=int(np.sqrt(len(patients))), metric='euclidean')
NNH.fit(X_outlier1_patient_train, y_outlier1_patient_train)
y_outlier1_patient_predicted = NNH.predict(X_outlier1_patient_test)
print('Accuracy on Training data:',NNH.score(X_outlier1_patient_train, y_outlier1_patient_train))
print('Accuracy on Test data:',NNH.score(X_outlier1_patient_test, y_outlier1_patient_test))
Accuracy on Training data: 0.7782258064516129 Accuracy on Test data: 0.8387096774193549
Training Accuracy is 78% and Testing Accuracy is 84% with n_neighbors as 17.
# Confusion matrix of the baseline KNN model, rendered as an annotated heatmap,
# plus the per-class precision/recall/F1 report.
cm = confusion_matrix(y_outlier1_patient_test, y_outlier1_patient_predicted, labels=[0, 1, 2])
# Human-readable names for the encoded labels 0/1/2 (alphabetical encoding).
# The original wrapped these lists in redundant `[i for i in [...]]` identity
# comprehensions; a plain list is equivalent.
class_names = ["Normal", "Type_H", "Type_S"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(10,7))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', lw=1)
plt.yticks(fontsize=12, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.show();
print("Classification Matrix:\n",classification_report(y_outlier1_patient_test,y_outlier1_patient_predicted))
Classification Matrix:
precision recall f1-score support
0 0.78 0.82 0.80 22
1 0.64 0.70 0.67 10
2 0.96 0.90 0.93 30
accuracy 0.84 62
macro avg 0.79 0.81 0.80 62
weighted avg 0.85 0.84 0.84 62
# Elbow search on the raw (outliers kept) standardized data: fit KNN for every
# odd k in 1..19 and print train/test accuracy plus the classification report.
for k in range(1,20,2):
    NNH = KNeighborsClassifier(n_neighbors= k, metric='euclidean')
    NNH.fit(X_outlier1_patient_train, y_outlier1_patient_train)
    y_outlier2_patient_predicted = NNH.predict(X_outlier1_patient_test)
    print('K:',k)
    print("Accuracy on Training data:",NNH.score(X_outlier1_patient_train, y_outlier1_patient_train))
    print("Accuracy on Testing data:",NNH.score(X_outlier1_patient_test, y_outlier1_patient_test))
    print("Classification Matrix:\n",classification_report(y_outlier1_patient_test,y_outlier2_patient_predicted))
K: 1
Accuracy on Training data: 1.0
Accuracy on Testing data: 0.7903225806451613
Classification Matrix:
precision recall f1-score support
0 0.71 0.77 0.74 22
1 0.64 0.70 0.67 10
2 0.93 0.83 0.88 30
accuracy 0.79 62
macro avg 0.76 0.77 0.76 62
weighted avg 0.80 0.79 0.79 62
K: 3
Accuracy on Training data: 0.9153225806451613
Accuracy on Testing data: 0.8064516129032258
Classification Matrix:
precision recall f1-score support
0 0.72 0.82 0.77 22
1 0.60 0.60 0.60 10
2 0.96 0.87 0.91 30
accuracy 0.81 62
macro avg 0.76 0.76 0.76 62
weighted avg 0.82 0.81 0.81 62
K: 5
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.79 0.86 0.83 22
1 0.70 0.70 0.70 10
2 0.93 0.87 0.90 30
accuracy 0.84 62
macro avg 0.81 0.81 0.81 62
weighted avg 0.84 0.84 0.84 62
K: 7
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.79 0.86 0.83 22
1 0.67 0.60 0.63 10
2 0.93 0.90 0.92 30
accuracy 0.84 62
macro avg 0.80 0.79 0.79 62
weighted avg 0.84 0.84 0.84 62
K: 9
Accuracy on Training data: 0.8225806451612904
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.76 0.86 0.81 22
1 0.70 0.70 0.70 10
2 0.96 0.87 0.91 30
accuracy 0.84 62
macro avg 0.81 0.81 0.81 62
weighted avg 0.85 0.84 0.84 62
K: 11
Accuracy on Training data: 0.8145161290322581
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
precision recall f1-score support
0 0.72 0.82 0.77 22
1 0.78 0.70 0.74 10
2 0.93 0.87 0.90 30
accuracy 0.82 62
macro avg 0.81 0.79 0.80 62
weighted avg 0.83 0.82 0.82 62
K: 13
Accuracy on Training data: 0.8104838709677419
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
precision recall f1-score support
0 0.79 0.86 0.83 22
1 0.73 0.80 0.76 10
2 0.96 0.87 0.91 30
accuracy 0.85 62
macro avg 0.83 0.84 0.83 62
weighted avg 0.86 0.85 0.86 62
K: 15
Accuracy on Training data: 0.7862903225806451
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
precision recall f1-score support
0 0.82 0.82 0.82 22
1 0.64 0.70 0.67 10
2 0.97 0.93 0.95 30
accuracy 0.85 62
macro avg 0.81 0.82 0.81 62
weighted avg 0.86 0.85 0.86 62
K: 17
Accuracy on Training data: 0.7782258064516129
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.78 0.82 0.80 22
1 0.64 0.70 0.67 10
2 0.96 0.90 0.93 30
accuracy 0.84 62
macro avg 0.79 0.81 0.80 62
weighted avg 0.85 0.84 0.84 62
K: 19
Accuracy on Training data: 0.7903225806451613
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
precision recall f1-score support
0 0.79 0.86 0.83 22
1 0.70 0.70 0.70 10
2 0.96 0.90 0.93 30
accuracy 0.85 62
macro avg 0.82 0.82 0.82 62
weighted avg 0.86 0.85 0.86 62
If we don't remove the outliers, k=13, makes improvement in Accuracy as 81%(training), 85%(Testing) and precision as 79%(class0), 73%(class1) and 96%(class2).
Let remove possible outliers and re-run the model to check the report.
# Keep an untouched copy before outlier treatment.
patients_with_outliers = patients.copy(deep=True)

# IQR-based outlier treatment: any value beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR
# is replaced by that column's median (computed before any replacement).
columns = patients.columns.drop('Class')
for col in columns:
    q1 = np.quantile(patients[col], 0.25)
    q3 = np.quantile(patients[col], 0.75)
    margin = (q3 - q1) * 1.5
    lower, upper = q1 - margin, q3 + margin
    med = patients[col].median()
    is_outlier = (patients[col] > upper) | (patients[col] < lower)
    patients[col] = patients[col].mask(is_outlier, med)
# some outliers will be removed
box = px.box(patients.drop(columns=['Class']), orientation='h')
box.show()
# Independent Variables
X_wo_outlier1_patient=patients.drop(columns='Class')
# Target Variables
y_wo_outlier1_patient=patients.Class
# Re-split after outlier treatment; same test_size/random_state as before so
# the row partition is comparable to the with-outliers run.
X_wo_outlier1_patient_train, X_wo_outlier1_patient_test, y_wo_outlier1_patient_train, y_wo_outlier1_patient_test = train_test_split(X_wo_outlier1_patient.apply(stats.zscore), y_wo_outlier1_patient, test_size=0.20, random_state=42)
# sqrt-of-n heuristic again: int(sqrt(310)) = 17 neighbours.
NNH=KNeighborsClassifier(n_neighbors= int(np.sqrt(len(patients))), metric='euclidean')
NNH.fit(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train)
y_wo_outlier1_patient_predicted = NNH.predict(X_wo_outlier1_patient_test)
print('Accuracy on Training data:',NNH.score(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train))
print('Accuracy on Test data:',NNH.score(X_wo_outlier1_patient_test, y_wo_outlier1_patient_test))
print("Classification Matrix:\n",classification_report(y_wo_outlier1_patient_test,y_wo_outlier1_patient_predicted))
Accuracy on Training data: 0.8024193548387096
Accuracy on Test data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.76 0.86 0.81 22
1 0.70 0.70 0.70 10
2 0.96 0.87 0.91 30
accuracy 0.84 62
macro avg 0.81 0.81 0.81 62
weighted avg 0.85 0.84 0.84 62
After removing outliers, Training Accuracy is 80% and Testing Accuracy as 84% with n_neighbors as 17.
# Elbow search on the outlier-treated data: fit KNN for every odd k in 1..19
# and print train/test accuracy plus the classification report.
# The original initialised train_score/test_score lists that were never
# appended to or read anywhere — dead code, removed.
for k in range(1,20,2):
    NNH = KNeighborsClassifier(n_neighbors= k, metric='euclidean')
    NNH.fit(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train)
    y_wo_outlier2_patient_predicted = NNH.predict(X_wo_outlier1_patient_test)
    print('K:',k)
    print("Accuracy on Training data:",NNH.score(X_wo_outlier1_patient_train, y_wo_outlier1_patient_train))
    print("Accuracy on Testing data:",NNH.score(X_wo_outlier1_patient_test, y_wo_outlier1_patient_test))
    print("Classification Matrix:\n",classification_report(y_wo_outlier1_patient_test,y_wo_outlier2_patient_predicted))
K: 1
Accuracy on Training data: 1.0
Accuracy on Testing data: 0.8709677419354839
Classification Matrix:
precision recall f1-score support
0 0.83 0.86 0.84 22
1 0.70 0.70 0.70 10
2 0.97 0.93 0.95 30
accuracy 0.87 62
macro avg 0.83 0.83 0.83 62
weighted avg 0.87 0.87 0.87 62
K: 3
Accuracy on Training data: 0.8911290322580645
Accuracy on Testing data: 0.8064516129032258
Classification Matrix:
precision recall f1-score support
0 0.76 0.73 0.74 22
1 0.54 0.70 0.61 10
2 0.96 0.90 0.93 30
accuracy 0.81 62
macro avg 0.75 0.78 0.76 62
weighted avg 0.82 0.81 0.81 62
K: 5
Accuracy on Training data: 0.8548387096774194
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.81 0.77 0.79 22
1 0.64 0.70 0.67 10
2 0.93 0.93 0.93 30
accuracy 0.84 62
macro avg 0.79 0.80 0.80 62
weighted avg 0.84 0.84 0.84 62
K: 7
Accuracy on Training data: 0.8346774193548387
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
precision recall f1-score support
0 0.77 0.77 0.77 22
1 0.58 0.70 0.64 10
2 0.96 0.90 0.93 30
accuracy 0.82 62
macro avg 0.77 0.79 0.78 62
weighted avg 0.83 0.82 0.83 62
K: 9
Accuracy on Training data: 0.8387096774193549
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.76 0.86 0.81 22
1 0.67 0.60 0.63 10
2 0.96 0.90 0.93 30
accuracy 0.84 62
macro avg 0.80 0.79 0.79 62
weighted avg 0.84 0.84 0.84 62
K: 11
Accuracy on Training data: 0.8185483870967742
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.78 0.82 0.80 22
1 0.64 0.70 0.67 10
2 0.96 0.90 0.93 30
accuracy 0.84 62
macro avg 0.79 0.81 0.80 62
weighted avg 0.85 0.84 0.84 62
K: 13
Accuracy on Training data: 0.8064516129032258
Accuracy on Testing data: 0.8548387096774194
Classification Matrix:
precision recall f1-score support
0 0.83 0.86 0.84 22
1 0.67 0.80 0.73 10
2 0.96 0.87 0.91 30
accuracy 0.85 62
macro avg 0.82 0.84 0.83 62
weighted avg 0.87 0.85 0.86 62
K: 15
Accuracy on Training data: 0.7983870967741935
Accuracy on Testing data: 0.8225806451612904
Classification Matrix:
precision recall f1-score support
0 0.76 0.86 0.81 22
1 0.60 0.60 0.60 10
2 0.96 0.87 0.91 30
accuracy 0.82 62
macro avg 0.77 0.78 0.77 62
weighted avg 0.83 0.82 0.83 62
K: 17
Accuracy on Training data: 0.8024193548387096
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.76 0.86 0.81 22
1 0.70 0.70 0.70 10
2 0.96 0.87 0.91 30
accuracy 0.84 62
macro avg 0.81 0.81 0.81 62
weighted avg 0.85 0.84 0.84 62
K: 19
Accuracy on Training data: 0.8024193548387096
Accuracy on Testing data: 0.8387096774193549
Classification Matrix:
precision recall f1-score support
0 0.79 0.86 0.83 22
1 0.64 0.70 0.67 10
2 0.96 0.87 0.91 30
accuracy 0.84 62
macro avg 0.80 0.81 0.80 62
weighted avg 0.85 0.84 0.84 62
After removing the outliers, k=13, makes improvement in Accuracy as 81%(training), 85%(Testing) and precision as 83%(class0), 67%(class1) and 96%(class2).
Used different methods to choose n_neighbors, both with and without outliers; since there were relatively few outliers, the model was built from both perspectives. n_neighbors (K) = 13 gave the best result in both cases. The n_neighbors (K) parameter is therefore significant for improving the performance of the model.
# Reading Data1.csv
# Load the first customer file (demographics/spend attributes, keyed by ID)
# and preview the first 5 rows.
data1=pd.read_csv('Data1.csv')
data1.head()
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 |
# Reading Data2.csv
# Load the second customer file (account/product flags plus the LoanOnCard
# target, keyed by ID) and preview the first 5 rows.
data2=pd.read_csv('Data2.csv')
data2.head()
| ID | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 1 | 2 | 0 | 1 | 0 | 0 | 0 | NaN |
| 2 | 3 | 0 | 0 | 0 | 0 | 0 | NaN |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | NaN |
| 4 | 5 | 0 | 0 | 0 | 0 | 1 | NaN |
# Sanity-check data1: output shows 5000 rows x 8 columns, no nulls.
print("Shape of data1 dataset", data1.shape)
data1.info()
Shape of data1 dataset (5000, 8) <class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 CustomerSince 5000 non-null int64 3 HighestSpend 5000 non-null int64 4 ZipCode 5000 non-null int64 5 HiddenScore 5000 non-null int64 6 MonthlyAverageSpend 5000 non-null float64 7 Level 5000 non-null int64 dtypes: float64(1), int64(7) memory usage: 312.6 KB
Data1 dataset contains 5000 data with 8 columns.
# Sanity-check data2: output shows 5000 rows x 7 columns, with 20 missing
# values in the LoanOnCard target (4980 non-null).
print("Shape of data2 dataset", data2.shape)
data2.info()
Shape of data2 dataset (5000, 7) <class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Mortgage 5000 non-null int64 2 Security 5000 non-null int64 3 FixedDepositAccount 5000 non-null int64 4 InternetBanking 5000 non-null int64 5 CreditCard 5000 non-null int64 6 LoanOnCard 4980 non-null float64 dtypes: float64(1), int64(6) memory usage: 273.6 KB
Data2 dataset contains 5000 data with 7 columns.
# Join the two customer files on the shared 'ID' key (pd.merge defaults to an
# inner join; both frames have the same 5000 IDs, so all rows survive).
cust_data=pd.merge(data1,data2,on='ID')
cust_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 CustomerSince 5000 non-null int64 3 HighestSpend 5000 non-null int64 4 ZipCode 5000 non-null int64 5 HiddenScore 5000 non-null int64 6 MonthlyAverageSpend 5000 non-null float64 7 Level 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Security 5000 non-null int64 10 FixedDepositAccount 5000 non-null int64 11 InternetBanking 5000 non-null int64 12 CreditCard 5000 non-null int64 13 LoanOnCard 4980 non-null float64 dtypes: float64(2), int64(12) memory usage: 585.9 KB
Cust_data dataset contains 5000 data with 14 columns
# These columns hold binary/ordinal codes rather than true numerics — recast
# them as 'object' so they are treated as categorical in later analysis.
col=['CreditCard','InternetBanking','FixedDepositAccount','Security','Level','HiddenScore']
for i in col:
    cust_data[i]=cust_data[i].astype('object')
cust_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 CustomerSince 5000 non-null int64 3 HighestSpend 5000 non-null int64 4 ZipCode 5000 non-null int64 5 HiddenScore 5000 non-null object 6 MonthlyAverageSpend 5000 non-null float64 7 Level 5000 non-null object 8 Mortgage 5000 non-null int64 9 Security 5000 non-null object 10 FixedDepositAccount 5000 non-null object 11 InternetBanking 5000 non-null object 12 CreditCard 5000 non-null object 13 LoanOnCard 4980 non-null float64 dtypes: float64(2), int64(6), object(6) memory usage: 585.9+ KB
As these features contains binary values, i.e. 1&0, in future analysis we can convert same to category type also.
# Class balance of the target: bar chart of LoanOnCard values 0 and 1
# (presumably 1 = customer has a loan on card — verify against data source).
plt.figure(figsize=(10,7))
sns.countplot(data=cust_data,x='LoanOnCard',order=[0,1])
plt.xlabel(xlabel='LoanOnCard', fontsize=12, weight='bold')
plt.ylabel(ylabel='count', fontsize=12, weight='bold')
plt.title(label='LoanOnCard Target Variable Representation', fontsize=18, weight='bold')
plt.show();
# Exact counts behind the chart: 4500 zeros vs 480 ones — heavily imbalanced.
cust_data.LoanOnCard.value_counts()
0.0 4500 1.0 480 Name: LoanOnCard, dtype: int64
# Count missing entries in the target column (isna is an alias of isnull).
cust_data['LoanOnCard'].isna().sum()
20
As we can see, the dataset has missing values: of the 5000 total entries, only 4980 are labelled (4500 in class 0 and 480 in class 1), leaving 20 entries missing. Some data cleansing is therefore needed. Since LoanOnCard is the target variable, we can also convert it to the category dtype.
# To check the missing data percentage per column
cust_data.isna().mean().mul(100)
ID 0.0 Age 0.0 CustomerSince 0.0 HighestSpend 0.0 ZipCode 0.0 HiddenScore 0.0 MonthlyAverageSpend 0.0 Level 0.0 Mortgage 0.0 Security 0.0 FixedDepositAccount 0.0 InternetBanking 0.0 CreditCard 0.0 LoanOnCard 0.4 dtype: float64
# Same missing-percentage computation, written out explicitly.
cust_data.isna().sum().div(len(cust_data)).mul(100)
ID 0.0 Age 0.0 CustomerSince 0.0 HighestSpend 0.0 ZipCode 0.0 HiddenScore 0.0 MonthlyAverageSpend 0.0 Level 0.0 Mortgage 0.0 Security 0.0 FixedDepositAccount 0.0 InternetBanking 0.0 CreditCard 0.0 LoanOnCard 0.4 dtype: float64
0.4% of data is missing for 'LoanOnCard' feature, which is about 20 entries. Let's remove these missing entries.
# Remove the 20 rows with a missing LoanOnCard value, then verify.
cust_data.dropna(inplace=True)  # axis=0 (rows) is the default
cust_data.isnull().sum()
# LoanOnCard is the target: store it as integer-coded categories (float -> int -> category).
cust_data['LoanOnCard'] = cust_data['LoanOnCard'].astype('int').astype('category')
cust_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4980 entries, 9 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 4980 non-null int64 1 Age 4980 non-null int64 2 CustomerSince 4980 non-null int64 3 HighestSpend 4980 non-null int64 4 ZipCode 4980 non-null int64 5 HiddenScore 4980 non-null object 6 MonthlyAverageSpend 4980 non-null float64 7 Level 4980 non-null object 8 Mortgage 4980 non-null int64 9 Security 4980 non-null object 10 FixedDepositAccount 4980 non-null object 11 InternetBanking 4980 non-null object 12 CreditCard 4980 non-null object 13 LoanOnCard 4980 non-null category dtypes: category(1), float64(1), int64(6), object(6) memory usage: 549.7+ KB
As missing data has been removed and LoanOnCard dtype updated to category type, let's re-visualize distribution of Target variable ‘LoanOnCard’.
# Re-visualize the cleaned target distribution: bar chart plus donut chart.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=cust_data, x='LoanOnCard', ax=ax)
ax.set_xlabel('LoanOnCard', fontsize=12, weight='bold')
ax.set_ylabel('count', fontsize=12, weight='bold')
ax.set_title('LoanOnCard Target Variable Representation', fontsize=18, weight='bold')
plt.show()
counts = cust_data['LoanOnCard'].value_counts()
labels = ['Loan on Card' if flag else 'No Loan on Card' for flag in counts.index]
fig = px.pie(cust_data, hole=0.3, values=counts, names=labels,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='LoanOnCard Representation')
fig.show()
90.36% (4500) of customers don't have a loan on card and 9.64% (480) have an existing loan on card.
# Now convert the same flag columns to pandas' memory-efficient category dtype.
col = ['CreditCard', 'InternetBanking', 'FixedDepositAccount', 'Security', 'Level', 'HiddenScore']
for name in col:
    cust_data[name] = cust_data[name].astype('category')
cust_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4980 entries, 9 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 4980 non-null int64 1 Age 4980 non-null int64 2 CustomerSince 4980 non-null int64 3 HighestSpend 4980 non-null int64 4 ZipCode 4980 non-null int64 5 HiddenScore 4980 non-null category 6 MonthlyAverageSpend 4980 non-null float64 7 Level 4980 non-null category 8 Mortgage 4980 non-null int64 9 Security 4980 non-null category 10 FixedDepositAccount 4980 non-null category 11 InternetBanking 4980 non-null category 12 CreditCard 4980 non-null category 13 LoanOnCard 4980 non-null category dtypes: category(7), float64(1), int64(6) memory usage: 346.2 KB
Analyzing categorical variables.
# Frequency of each HiddenScore level, plus the total as a sanity check.
cust_data['HiddenScore'].value_counts(), cust_data['HiddenScore'].value_counts().sum()
(1 1466 2 1293 4 1215 3 1006 Name: HiddenScore, dtype: int64, 4980)
# Donut chart of HiddenScore frequencies.
vc = cust_data['HiddenScore'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='HiddenScore Representation')
fig.show()
Distribution of data in HiddenScore has no unexpected values, so no need of imputation.
# Frequency of each Level value, plus the total as a sanity check.
cust_data['Level'].value_counts(), cust_data['Level'].value_counts().sum()
(1 2089 3 1496 2 1395 Name: Level, dtype: int64, 4980)
# Donut chart of Level frequencies.
vc = cust_data['Level'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='Level Representation')
fig.show()
Distribution of data in Level has no unexpected values, so no need of imputation.
# Frequency of the Security flag, plus the total as a sanity check.
cust_data['Security'].value_counts(), cust_data['Security'].value_counts().sum()
(0 4460 1 520 Name: Security, dtype: int64, 4980)
# Donut chart of the Security flag.
vc = cust_data['Security'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='Security Representation')
fig.show()
Distribution of data in Security has no unexpected values, so no need of imputation.
# Frequency of the FixedDepositAccount flag, plus the total as a sanity check.
cust_data['FixedDepositAccount'].value_counts(), cust_data['FixedDepositAccount'].value_counts().sum()
(0 4678 1 302 Name: FixedDepositAccount, dtype: int64, 4980)
# Donut chart of the FixedDepositAccount flag.
vc = cust_data['FixedDepositAccount'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='FixedDepositAccount Representation')
fig.show()
Distribution of data in FixedDepositAccount has no unexpected values, so no need of imputation.
# Frequency of the InternetBanking flag, plus the total as a sanity check.
cust_data['InternetBanking'].value_counts(), cust_data['InternetBanking'].value_counts().sum()
(1 2974 0 2006 Name: InternetBanking, dtype: int64, 4980)
# Donut chart of the InternetBanking flag.
vc = cust_data['InternetBanking'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='InternetBanking Representation')
fig.show()
Distribution of data in InternetBanking has no unexpected values, so no need of imputation.
# Frequency of the CreditCard flag, plus the total as a sanity check.
cust_data['CreditCard'].value_counts(), cust_data['CreditCard'].value_counts().sum()
(0 3514 1 1466 Name: CreditCard, dtype: int64, 4980)
# Donut chart of the CreditCard flag.
vc = cust_data['CreditCard'].value_counts()
fig = px.pie(cust_data, hole=0.3, values=vc, names=vc.index,
             color_discrete_sequence=px.colors.sequential.turbid_r,
             title='CreditCard Representation')
fig.show()
Distribution of data in CreditCard has no unexpected values, so no need of imputation.
# Calculating correlation of independent variables with the target variable.
cust_data_dup = cust_data.copy(deep=True)
# .corr() needs numeric dtypes, so temporarily recast the categorical flags.
# (The original code converted Level twice; the duplicate line is removed.)
for flag in ('HiddenScore', 'Level', 'Security', 'FixedDepositAccount',
             'CreditCard', 'InternetBanking', 'LoanOnCard'):
    cust_data_dup[flag] = cust_data_dup[flag].astype('int64')
# ID and ZipCode are identifiers, not predictors — exclude them.
cust_data_dup.drop(columns=['ID', 'ZipCode'], inplace=True)
cust_data_dup.corr()
| Age | CustomerSince | HighestSpend | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.994208 | -0.054951 | -0.045289 | -0.051896 | 0.042750 | -0.013272 | 0.000323 | 0.007744 | 0.011227 | 0.007344 | -0.008147 |
| CustomerSince | 0.994208 | 1.000000 | -0.046092 | -0.051456 | -0.049918 | 0.014545 | -0.011380 | -0.000469 | 0.010085 | 0.011355 | 0.008779 | -0.007801 |
| HighestSpend | -0.054951 | -0.046092 | 1.000000 | -0.158357 | 0.646109 | -0.188909 | 0.207236 | -0.002284 | 0.169535 | 0.014202 | -0.002780 | 0.502626 |
| HiddenScore | -0.045289 | -0.051456 | -0.158357 | 1.000000 | -0.109180 | 0.065762 | -0.021396 | 0.019061 | 0.014327 | 0.010900 | 0.010784 | 0.061761 |
| MonthlyAverageSpend | -0.051896 | -0.049918 | 0.646109 | -0.109180 | 1.000000 | -0.137020 | 0.110275 | 0.015105 | 0.136410 | -0.003475 | -0.006577 | 0.366912 |
| Level | 0.042750 | 0.014545 | -0.188909 | 0.065762 | -0.137020 | 1.000000 | -0.032863 | -0.009443 | 0.013982 | -0.014556 | -0.011766 | 0.137010 |
| Mortgage | -0.013272 | -0.011380 | 0.207236 | -0.021396 | 0.110275 | -0.032863 | 1.000000 | -0.005002 | 0.089167 | -0.007044 | -0.007600 | 0.141947 |
| Security | 0.000323 | -0.000469 | -0.002284 | 0.019061 | 0.015105 | -0.009443 | -0.005002 | 1.000000 | 0.317673 | 0.014007 | -0.014518 | 0.021982 |
| FixedDepositAccount | 0.007744 | 0.010085 | 0.169535 | 0.014327 | 0.136410 | 0.013982 | 0.089167 | 0.317673 | 1.000000 | 0.176082 | 0.278924 | 0.316131 |
| InternetBanking | 0.011227 | 0.011355 | 0.014202 | 0.010900 | -0.003475 | -0.014556 | -0.007044 | 0.014007 | 0.176082 | 1.000000 | 0.004960 | 0.006034 |
| CreditCard | 0.007344 | 0.008779 | -0.002780 | 0.010784 | -0.006577 | -0.011766 | -0.007600 | -0.014518 | 0.278924 | 0.004960 | 1.000000 | 0.002536 |
| LoanOnCard | -0.008147 | -0.007801 | 0.502626 | 0.061761 | 0.366912 | 0.137010 | 0.141947 | 0.021982 | 0.316131 | 0.006034 | 0.002536 | 1.000000 |
Analyzing the correlation of LoanOnCard with the other features, it's clear we can remove Age, CustomerSince, Security, InternetBanking, and CreditCard, as these features have near-zero correlation with LoanOnCard.
# Independent Variables: drop identifiers, the target, and the weakly-correlated features.
dropped = ['ID', 'ZipCode', 'LoanOnCard', 'Age', 'CustomerSince',
           'Security', 'InternetBanking', 'CreditCard']
X = cust_data.drop(columns=dropped)
# Target Variable
y = cust_data.LoanOnCard
# Stratified 75/25 split keeps the class ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)
X_train.shape, X_test.shape
# Baseline: logistic regression on the (unbalanced) data.
model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
model.coef_, model.intercept_
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print("Accuracy score Training dataset:{:.2f}".format(train_acc))
print("Accuracy score Testing dataset:{:.2f}".format(test_acc))
Accuracy score Training dataset:0.95 Accuracy score Testing dataset:0.94
# Fix: log loss is defined on predicted *probabilities*; feeding hard 0/1
# labels (as before) collapses each probability to 0 or 1 and inflates the
# metric — use predict_proba instead.
print("Log loss:{:.2f}".format(log_loss(y_test, model.predict_proba(X_test))))
# fmt='d' shows the confusion-matrix cell counts as integers.
sns.heatmap(confusion_matrix(y_test, y_predict), annot=True, fmt='d')
print(classification_report(y_test, y_predict))
Log loss:2.00
precision recall f1-score support
0 0.96 0.98 0.97 1125
1 0.77 0.57 0.65 120
accuracy 0.94 1245
macro avg 0.86 0.77 0.81 1245
weighted avg 0.94 0.94 0.94 1245
# Re-check the class counts of the target after cleaning.
cust_data['LoanOnCard'].value_counts()
0 4500 1 480 Name: LoanOnCard, dtype: int64
The class distribution is imbalanced. Since the LoanOnCard=1 class has only 480 entries, let's undersample the LoanOnCard=0 class to 480 entries for a balanced distribution.
# Undersample the majority class to 480 rows so both classes are equal.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 —
# pd.concat produces the same result and keeps the code forward-compatible.
majority_sample = cust_data[cust_data.LoanOnCard == 0].sample(n=480, random_state=42)
minority = cust_data[cust_data.LoanOnCard == 1]
cust_data_balanced = pd.concat([majority_sample, minority])
cust_data_balanced.reset_index(drop=True, inplace=True)
cust_data_balanced.LoanOnCard.value_counts()
0 480 1 480 Name: LoanOnCard, dtype: int64
Data uniformly distributed w.r.t target variable.
# Independent Variables: same feature subset as the unbalanced experiment.
dropped = ['ID', 'ZipCode', 'LoanOnCard', 'Age', 'CustomerSince',
           'Security', 'InternetBanking', 'CreditCard']
X_balanced = cust_data_balanced.drop(columns=dropped)
# Target Variable
y_balanced = cust_data_balanced.LoanOnCard
# Stratified 75/25 split on the balanced sample.
X_balanced_train, X_balanced_test, y_balanced_train, y_balanced_test = train_test_split(
    X_balanced, y_balanced, test_size=0.25, random_state=42, stratify=y_balanced)
X_balanced_train.shape, X_balanced_test.shape
# Logistic regression retrained on the balanced sample.
model = LogisticRegression()
model.fit(X_balanced_train, y_balanced_train)
y_balanced_predict = model.predict(X_balanced_test)
print("Accuracy score Training dataset:{:.2f}".format(model.score(X_balanced_train, y_balanced_train)))
print("Accuracy score Testing dataset:{:.2f}".format(model.score(X_balanced_test, y_balanced_test)))
# Fix: log loss must be computed from class probabilities, not hard predictions.
print("Log loss:{:.2f}".format(log_loss(y_balanced_test, model.predict_proba(X_balanced_test))))
# fmt='d' shows the confusion-matrix cell counts as integers.
sns.heatmap(confusion_matrix(y_balanced_test, y_balanced_predict), annot=True, fmt='d')
print(classification_report(y_balanced_test, y_balanced_predict))
Accuracy score Training dataset:0.90
Accuracy score Testing dataset:0.92
Log loss:2.88
precision recall f1-score support
0 0.89 0.95 0.92 120
1 0.95 0.88 0.91 120
accuracy 0.92 240
macro avg 0.92 0.92 0.92 240
weighted avg 0.92 0.92 0.92 240
The model has now been trained for two cases (unbalanced and balanced data).
Here we will focus on precision, as we want a higher proportion of true positives (TP) among the predicted positives, so:
# Z-score the numeric columns; keep the categorical flag columns untouched.
# NOTE(review): scaling is fit on the full balanced set *before* the split —
# a small train/test leakage; consider fitting the scaler on the train fold only.
numeric_scaled = X_balanced.select_dtypes(include='number').apply(stats.zscore)
categorical = X_balanced[['HiddenScore', 'Level', 'FixedDepositAccount']]
X_balanced_Scaled = pd.concat([numeric_scaled, categorical], axis=1)
X_balanced_scaled_train, X_balanced_scaled_test, y_balanced_scaled_train, y_balanced_scaled_test = train_test_split(
    X_balanced_Scaled, y_balanced, test_size=0.25, random_state=42, stratify=y_balanced)
# SVM Modelling (initial hand-picked hyper-parameters)
model_svm = SVC(gamma=0.1, C=1)
model_svm.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_svm_balanced_predict = model_svm.predict(X_balanced_scaled_test)
svm_train_acc = model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)
svm_test_acc = model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)
print("Accuracy score Training dataset:{:.2f}".format(svm_train_acc))
print("Accuracy score Testing dataset:{:.2f}".format(svm_test_acc))
sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test, y_svm_balanced_predict))
Accuracy score Training dataset:0.95
Accuracy score Testing dataset:0.94
precision recall f1-score support
0 0.95 0.93 0.94 120
1 0.93 0.95 0.94 120
accuracy 0.94 240
macro avg 0.94 0.94 0.94 240
weighted avg 0.94 0.94 0.94 240
# KNN Modelling
# Rule-of-thumb starting K: square root of the training-set size.
k = int(np.sqrt(len(X_balanced_scaled_train)))
model_knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
model_knn.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_knn_balanced_predict = model_knn.predict(X_balanced_scaled_test)
print("Value of n_neighbors(K):", k)
print("Accuracy score Training dataset:{:.2f}".format(model_knn.score(X_balanced_scaled_train, y_balanced_scaled_train)))
# Fix: this line previously said "Training" but reports the testing score.
print("Accuracy score Testing dataset:{:.2f}".format(model_knn.score(X_balanced_scaled_test, y_balanced_scaled_test)))
sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_knn_balanced_predict), annot=True, fmt='.2f')
print("Classification Matrix:\n", classification_report(y_balanced_scaled_test, y_knn_balanced_predict))
Value of n_neighbors(K): 26
Accuracy score Training dataset:0.94
Accuracy score Training dataset:0.93
Classification Matrix:
precision recall f1-score support
0 0.93 0.94 0.93 120
1 0.94 0.93 0.93 120
accuracy 0.93 240
macro avg 0.93 0.93 0.93 240
weighted avg 0.93 0.93 0.93 240
Tune parameters for SVM
# defining parameter range for the SVM grid search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True)
# fitting the model for grid search
grid.fit(X_balanced_scaled_train, y_balanced_scaled_train)
# best parameters after tuning, and the resulting refit estimator
print(grid.best_params_)
print(grid.best_estimator_)
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, gamma=0.1)
Different values for C and gamma has been used here for SVM modelling, and best accuracy was provided by c=100, gamma=0.1 and kernel='rbf' which is default kernel.
# SVM Modelling with the grid-searched hyper-parameters (C=100, gamma=0.1)
model_svm = SVC(C=100, gamma=0.1)
model_svm.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_svm_balanced_predict = model_svm.predict(X_balanced_scaled_test)
tuned_train_acc = model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)
tuned_test_acc = model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)
print("Accuracy score Training dataset:{:.2f}".format(tuned_train_acc))
print("Accuracy score Testing dataset:{:.2f}".format(tuned_test_acc))
sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test, y_svm_balanced_predict))
Accuracy score Training dataset:0.98
Accuracy score Testing dataset:0.95
precision recall f1-score support
0 0.95 0.95 0.95 120
1 0.95 0.95 0.95 120
accuracy 0.95 240
macro avg 0.95 0.95 0.95 240
weighted avg 0.95 0.95 0.95 240
Tune parameters for KNN
# defining parameter range: K from 1 up to sqrt(n_train), both Minkowski metrics
k_max = int(np.sqrt(len(X_balanced_scaled_train)))
param_grid = {'n_neighbors': list(range(1, k_max)), 'p': [1, 2]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, refit=True)
# fitting the model for grid search
grid.fit(X_balanced_scaled_train, y_balanced_scaled_train)
# best parameters after tuning, and the resulting refit estimator
print(grid.best_params_)
print(grid.best_estimator_)
{'n_neighbors': 3, 'p': 1}
KNeighborsClassifier(n_neighbors=3, p=1)
K values from 1 to 26 have been compared, and K=3 with the Manhattan distance metric (p=1) gave the best result for the KNN model.
# KNN Modelling with the tuned hyper-parameters (K=3, Manhattan distance, p=1)
model_knn1 = KNeighborsClassifier(n_neighbors=3, p=1)
model_knn1.fit(X_balanced_scaled_train, y_balanced_scaled_train)
y_knn_balanced_predict1 = model_knn1.predict(X_balanced_scaled_test)
print("Accuracy score Training dataset:{:.2f}".format(model_knn1.score(X_balanced_scaled_train, y_balanced_scaled_train)))
# Fix: this line previously said "Training" but reports the testing score.
print("Accuracy score Testing dataset:{:.2f}".format(model_knn1.score(X_balanced_scaled_test, y_balanced_scaled_test)))
sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_knn_balanced_predict1), annot=True, fmt='.2f')
# Fix: the report previously used the *untuned* model's predictions
# (y_knn_balanced_predict), which is why it showed 0.93 while the accuracy
# above showed 0.95 — it must use y_knn_balanced_predict1.
print("Classification Matrix:\n", classification_report(y_balanced_scaled_test, y_knn_balanced_predict1))
Accuracy score Training dataset:0.97
Accuracy score Training dataset:0.95
Classification Matrix:
precision recall f1-score support
0 0.93 0.94 0.93 120
1 0.94 0.93 0.93 120
accuracy 0.93 240
macro avg 0.93 0.93 0.93 240
weighted avg 0.93 0.93 0.93 240
Among the different models trained, KNN is the one that has given the best results on the testing data.
# Re-print the tuned SVM's final evaluation for side-by-side comparison.
final_train_acc = model_svm.score(X_balanced_scaled_train, y_balanced_scaled_train)
final_test_acc = model_svm.score(X_balanced_scaled_test, y_balanced_scaled_test)
print("Accuracy score Training dataset:{:.2f}".format(final_train_acc))
print("Accuracy score Testing dataset:{:.2f}".format(final_test_acc))
sns.heatmap(confusion_matrix(y_balanced_scaled_test, y_svm_balanced_predict), annot=True, fmt='.2f')
print(classification_report(y_balanced_scaled_test, y_svm_balanced_predict))
Accuracy score Training dataset:0.98
Accuracy score Testing dataset:0.95
precision recall f1-score support
0 0.95 0.95 0.95 120
1 0.95 0.95 0.95 120
accuracy 0.95 240
macro avg 0.95 0.95 0.95 240
weighted avg 0.95 0.95 0.95 240
The base model trained on the data was LogisticRegression (on both balanced and unbalanced data), which achieved 92% accuracy on the balanced testing data, whereas the tuned SVM achieved 95% on the testing data.
In my view, KNN and SVM have provided the best results for predicting potential customers. Precision is the highly recommended metric here, though false negatives (FN) should also be monitored closely.